from __future__ import division

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

from PIL import Image

from utils.parse_config import * # import every public name from utils/parse_config.py
from utils.utils import build_targets
from collections import defaultdict

import matplotlib.pyplot as plt
import matplotlib.patches as patches

def create_modules(module_defs):
    """
    Construct a module list of layer blocks from the module configuration
    in ``module_defs``.

    :param module_defs: list of dicts parsed from the darknet .cfg file; the
        first entry is the [net] hyperparameter section and is popped off
        (note: ``module_defs`` is mutated by this call).
    :return: ``(hyperparams, module_list)`` where ``hyperparams`` is the [net]
        dict and ``module_list`` is an ``nn.ModuleList`` holding one
        ``nn.Sequential`` per layer definition.
    """
    hyperparams = module_defs.pop(0)  # the [net] section of the cfg
    # output_filters[0] is the network input depth (e.g. 3 for RGB);
    # output_filters[k + 1] is the channel count produced by layer k.
    output_filters = [int(hyperparams['channels'])]
    module_list = nn.ModuleList()

    for i, module_def in enumerate(module_defs):
        # Each cfg block becomes one Sequential holding its sub-modules.
        modules = nn.Sequential()

        if module_def['type'] == 'convolutional':
            bn = int(module_def['batch_normalize'])
            filters = int(module_def['filters'])
            kernel_size = int(module_def['size'])
            # 'pad' in the cfg is only a flag, not a size: "same" padding is (k-1)//2.
            pad = (kernel_size - 1) // 2 if int(module_def['pad']) else 0
            modules.add_module('conv_%d' % i, nn.Conv2d(in_channels=output_filters[-1],
                                                        out_channels=filters,
                                                        kernel_size=kernel_size,
                                                        stride=int(module_def['stride']),
                                                        padding=pad,
                                                        bias=not bn))  # BN supplies the shift term, so no conv bias when bn is on
            if bn:
                modules.add_module('batch_norm_%d' % i, nn.BatchNorm2d(filters))
            if module_def['activation'] == 'leaky':
                modules.add_module('leaky_%d' % i, nn.LeakyReLU(0.1))

        elif module_def['type'] == 'upsample':
            # Nearest-neighbour upsampling by the cfg 'stride' factor.
            upsample = nn.Upsample(scale_factor=int(module_def['stride']),
                                   mode='nearest')
            modules.add_module('upsample_%d' % i, upsample)

        elif module_def['type'] == 'route':
            # Route layer: forward (single index) or concatenate (several
            # indices) earlier layers' outputs; the concat itself happens in
            # Darknet.forward, so only a placeholder module is registered here.
            layers = [int(x) for x in module_def['layers'].split(',')]
            # BUG FIX: output_filters[0] is the input depth, so layer k's
            # channel count lives at output_filters[k + 1]. Slicing off the
            # first entry makes absolute (non-negative) cfg indices resolve to
            # the right layer while leaving relative (negative) indices intact.
            filters = sum(output_filters[1:][layer_i] for layer_i in layers)
            modules.add_module('route_%d' % i, EmptyLayer())

        elif module_def['type'] == 'shortcut':
            # Residual connection: output keeps the channel count of the
            # 'from' layer. Same off-by-one fix as for 'route' above.
            filters = output_filters[1:][int(module_def['from'])]
            modules.add_module('shortcut_%d' % i, EmptyLayer())

        elif module_def['type'] == 'yolo':
            # 'mask' selects which of the listed anchors this detection scale uses.
            anchor_idxs = [int(x) for x in module_def['mask'].split(',')]
            anchors = [int(x) for x in module_def['anchors'].split(',')]
            anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]  # pair up as (w, h)
            anchors = [anchors[i] for i in anchor_idxs]
            num_classes = int(module_def['classes'])
            img_height = int(hyperparams['height'])  # network input size
            yolo_layer = YOLOLayer(anchors, num_classes, img_height)
            modules.add_module('yolo_%d' % i, yolo_layer)

        # Register module list and number of output filters
        module_list.append(modules)
        output_filters.append(filters)

    return hyperparams, module_list

class EmptyLayer(nn.Module):
    """No-op placeholder for 'route' and 'shortcut' layers.

    The actual concatenation / addition is performed in ``Darknet.forward``;
    this module merely occupies the slot in the ModuleList and holds no
    parameters and no computation.
    """

    def __init__(self):
        super().__init__()

class YOLOLayer(nn.Module):
    """YOLOv3 detection layer.

    Decodes a conv feature map into bounding-box predictions and, when
    targets are supplied, computes the training losses against them.
    """
    def __init__(self, anchors, num_classes, img_dim):
        # anchors: list of (w, h) anchor sizes in input-image pixels (typically 3 per scale)
        # num_classes: number of object classes
        # img_dim: network input size (e.g. 416); assumes square padded inputs -- TODO confirm
        super(YOLOLayer, self).__init__()
        self.anchors = anchors
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        self.bbox_attrs = 5 + num_classes  # per-box attributes: x, y, w, h, objectness, one score per class
        self.img_dim = img_dim
        self.ignore_thres = 0.5  # anchors with IoU above this (yet not responsible for a target) are excluded from the objectness loss
        self.lambda_coord = 1  # weight on the coordinate losses

        # Loss functions: MSE for raw w/h, BCE for the sigmoid-activated outputs.
        self.mse_loss = nn.MSELoss()
        self.bce_loss = nn.BCELoss()

    def forward(self, x, targets=None):
        """Decode predictions; return the losses when targets are given,
        otherwise a detections tensor of shape (bs, num_boxes, 5 + num_classes)."""
        # x: previous layer's output, shape (bs, num_anchors * bbox_attrs, g_dim, g_dim)
        bs = x.size(0)  # batch size
        g_dim = x.size(2)  # grid size (cells per side); assumes a square feature map -- TODO confirm
        stride =  self.img_dim / g_dim  # pixels per grid cell (inputs are padded to img_dim, e.g. 416)
        # Tensor constructors matching x's device (CUDA or CPU)
        FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
        LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
        # Reshape (bs, num_anchors*bbox_attrs, g, g) -> (bs, num_anchors, g, g, bbox_attrs).
        # view() fills elements in row-major order, so the channel dim is first
        # split into (num_anchors, bbox_attrs) and bbox_attrs is then permuted
        # to the end -- viewing straight to the final shape would scramble data.
        # contiguous() re-lays the permuted tensor out contiguously in memory.
        prediction = x.view(bs,  self.num_anchors, self.bbox_attrs, g_dim, g_dim).permute(0, 1, 3, 4, 2).contiguous()

        # Get outputs.
        # x and y are sigmoid-squashed offsets within a grid cell (in [0, 1));
        # w and h stay raw here and are exponentiated below so sizes stay > 0.
        x = torch.sigmoid(prediction[..., 0])          # Center x
        y = torch.sigmoid(prediction[..., 1])          # Center y
        w = prediction[..., 2]                         # Width
        h = prediction[..., 3]                         # Height
        # conf: objectness confidence; pred_cls: per-class confidences
        conf = torch.sigmoid(prediction[..., 4])       # Conf
        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.

        # Calculate offsets for each grid cell: grid_x / grid_y hold the
        # top-left corner coordinate of every cell, broadcast over batch and
        # anchors. linspace(0, g-1, g) yields 0..g-1 evenly; repeat tiles it
        # into a g x g sheet, then into (bs*num_anchors, g, g); view matches
        # the prediction shape. grid_y is the transpose of grid_x's sheet.
        grid_x = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).repeat(bs*self.num_anchors, 1, 1).view(x.shape).type(FloatTensor)
        grid_y = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).t().repeat(bs*self.num_anchors, 1, 1).view(y.shape).type(FloatTensor)
        # Rescale anchors from input-image pixels (img_dim-based) to grid units.
        scaled_anchors = [(a_w / stride, a_h / stride) for a_w, a_h in self.anchors]
        # p_w / p_h of the YOLO box equations, broadcast to the grid shape.
        # FloatTensor(list of pairs) gives a (num_anchors, 2) tensor;
        # index_select(1, ...) picks the width (col 0) / height (col 1) column.
        anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
        anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
        anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(w.shape)
        anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(h.shape)

        # Add offset and scale with anchors:
        # b_x = sigmoid(t_x) + c_x, b_w = p_w * exp(t_w)  (exp keeps w, h > 0)
        pred_boxes = FloatTensor(prediction[..., :4].shape)  # uninitialised buffer shaped (..., 4)
        pred_boxes[..., 0] = x.data + grid_x
        pred_boxes[..., 1] = y.data + grid_y
        pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
        pred_boxes[..., 3] = torch.exp(h.data) * anchor_h

        # Training
        if targets is not None:
            # Move the loss modules to the GPU alongside the inputs if needed.
            if x.is_cuda:
                self.mse_loss = self.mse_loss.cuda()
                self.bce_loss = self.bce_loss.cuda()

            nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes.cpu().data,
                                                                            targets.cpu().data,
                                                                            scaled_anchors,
                                                                            self.num_anchors,
                                                                            self.num_classes,
                                                                            g_dim,
                                                                            self.ignore_thres,
                                                                            self.img_dim)
            # nGT: number of ground-truth boxes in the batch
            # nCorrect: number of ground truths matched by a correct prediction
            # mask: 1 at the (anchor, cell) responsible for each target
            # conf_mask: 1 where the objectness loss applies; entries whose IoU
            #   exceeds ignore_thres without being responsible are zeroed out
            # tx, ty, tw, th: regression targets in the same parameterisation as x, y, w, h
            # tconf: objectness target; tcls: one-hot class target
            # NOTE(review): nProposals is computed but never used or returned.
            nProposals = int((conf > 0.25).sum().item())  # predictions above a fixed 0.25 confidence (.item() extracts the scalar)
            recall = float(nCorrect / nGT) if nGT else 1  # fraction of ground truths recalled

            # Handle masks (Variable is the legacy pre-0.4 autograd wrapper)
            mask = Variable(mask.type(FloatTensor))
            cls_mask = Variable(mask.unsqueeze(-1).repeat(1, 1, 1, 1, self.num_classes).type(FloatTensor))  # mask broadcast across the class dimension
            conf_mask = Variable(conf_mask.type(FloatTensor))

            # Handle target variables -- constants w.r.t. backprop, hence requires_grad=False
            tx    = Variable(tx.type(FloatTensor), requires_grad=False)
            ty    = Variable(ty.type(FloatTensor), requires_grad=False)
            tw    = Variable(tw.type(FloatTensor), requires_grad=False)
            th    = Variable(th.type(FloatTensor), requires_grad=False)
            tconf = Variable(tconf.type(FloatTensor), requires_grad=False)
            tcls  = Variable(tcls.type(FloatTensor), requires_grad=False)

            # Mask outputs to ignore non-existing objects
            loss_x = self.lambda_coord * self.bce_loss(x * mask, tx * mask)
            loss_y = self.lambda_coord * self.bce_loss(y * mask, ty * mask)
            loss_w = self.lambda_coord * self.mse_loss(w * mask, tw * mask) / 2
            loss_h = self.lambda_coord * self.mse_loss(h * mask, th * mask) / 2
            loss_conf = self.bce_loss(conf * conf_mask, tconf * conf_mask)
            loss_cls = self.bce_loss(pred_cls * cls_mask, tcls * cls_mask)
            loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

            return loss, loss_x.item(), loss_y.item(), loss_w.item(), loss_h.item(), loss_conf.item(), loss_cls.item(), recall

        else:
            # If not in training phase return predictions: boxes rescaled back
            # to input-image pixels, concatenated with objectness and class
            # scores into (bs, num_boxes, 5 + num_classes).
            output = torch.cat((pred_boxes.view(bs, -1, 4) * stride, conf.view(bs, -1, 1), pred_cls.view(bs, -1, self.num_classes)), -1)
            return output.data


class Darknet(nn.Module):
    """YOLOv3 object detection model built from a darknet .cfg file."""

    def __init__(self, config_path, img_size=416):
        """
        :param config_path: path to the darknet .cfg file describing the network
        :param img_size: network input size (default 416)
        """
        super(Darknet, self).__init__()
        # Parse the .cfg file into a list of per-layer parameter dicts.
        self.module_defs = parse_model_config(config_path)
        # hyperparams: the [net] section; module_list: the built torch modules.
        self.hyperparams, self.module_list = create_modules(self.module_defs)
        self.img_size = img_size
        self.seen = 0  # image counter stored in the darknet weight-file header
        self.header_info = np.array([0, 0, 0, self.seen, 0])
        self.loss_names = ['x', 'y', 'w', 'h', 'conf', 'cls', 'recall']

    def forward(self, x, targets=None):
        """Run the network.

        :param x: input image batch tensor
        :param targets: ground-truth boxes; when given, the model is in
            training mode and the summed loss of all YOLO layers is returned,
            otherwise the concatenated detections are returned.
        """
        is_training = targets is not None
        output = []  # per-YOLO-layer results (losses while training, detections otherwise)
        self.losses = defaultdict(float)  # per-component loss accumulator (defaults to 0.0)
        layer_outputs = []  # every layer's output, for route/shortcut lookups
        for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
            if module_def['type'] in ['convolutional', 'upsample']:
                # Plain layers: just run the Sequential.
                x = module(x)
            elif module_def['type'] == 'route':
                # Concatenate the referenced layers' outputs along the channel dim.
                layer_i = [int(idx) for idx in module_def['layers'].split(',')]
                x = torch.cat([layer_outputs[idx] for idx in layer_i], 1)
            elif module_def['type'] == 'shortcut':
                # Residual connection: add a previous layer's output.
                layer_i = int(module_def['from'])
                x = layer_outputs[-1] + layer_outputs[layer_i]
            elif module_def['type'] == 'yolo':
                if is_training:
                    # Train phase: a yolo Sequential holds a single YOLOLayer;
                    # call it directly so the extra targets argument can be passed.
                    x, *losses = module[0](x, targets)
                    for name, loss in zip(self.loss_names, losses):
                        self.losses[name] += loss
                else:
                    # Test phase: get detections.
                    x = module(x)
                output.append(x)
            layer_outputs.append(x)

        # Average recall over the detection heads -- assumes exactly 3 YOLO
        # layers as in the standard yolov3.cfg; TODO confirm for other cfgs.
        self.losses['recall'] /= 3
        return sum(output) if is_training else torch.cat(output, 1)

    def load_weights(self, weights_path):
        """Parses and loads the darknet-format weights stored in 'weights_path'.

        File layout: five int32 header values, then raw float32 weights in
        layer order. Only convolutional layers (and their batch-norm
        companions) carry weights.
        """
        with open(weights_path, 'rb') as fp:
            header = np.fromfile(fp, dtype=np.int32, count=5)  # first five int32s are the header
            # Needed to write the header back when saving weights
            self.header_info = header
            self.seen = header[3]
            weights = np.fromfile(fp, dtype=np.float32)  # the rest of the file is the weights

        ptr = 0  # read cursor into the flat weights array
        for module_def, module in zip(self.module_defs, self.module_list):
            if module_def['type'] == 'convolutional':
                conv_layer = module[0]  # first sub-module of the Sequential is the conv
                # With batch norm the conv has no bias: read bn
                # bias/weight/mean/var first, then the conv kernel. Without bn:
                # conv bias first, then the kernel.
                # int() coercion: cfg values are strings, so '0' must not count as True.
                if int(module_def['batch_normalize']):
                    bn_layer = module[1]
                    num_b = bn_layer.bias.numel()  # number of values to read per bn tensor
                    # Bias
                    bn_b = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.bias)
                    bn_layer.bias.data.copy_(bn_b)  # copy_ writes into the existing parameter storage
                    ptr += num_b
                    # Weight
                    bn_w = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.weight)
                    bn_layer.weight.data.copy_(bn_w)
                    ptr += num_b
                    # Running Mean
                    bn_rm = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.running_mean)
                    bn_layer.running_mean.data.copy_(bn_rm)
                    ptr += num_b
                    # Running Var
                    bn_rv = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.running_var)
                    bn_layer.running_var.data.copy_(bn_rv)
                    ptr += num_b
                else:
                    # Load conv. bias
                    num_b = conv_layer.bias.numel()
                    conv_b = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(conv_layer.bias)
                    conv_layer.bias.data.copy_(conv_b)
                    ptr += num_b
                # Load conv. weights
                num_w = conv_layer.weight.numel()
                conv_w = torch.from_numpy(weights[ptr:ptr + num_w]).view_as(conv_layer.weight)
                conv_layer.weight.data.copy_(conv_w)
                ptr += num_w

    def save_weights(self, path, cutoff=-1):
        """Saves the model weights in darknet format.

        :param path: path of the new weights file
        :param cutoff: save layers 0..cutoff-1; -1 (the default) saves all layers
        """
        # BUG FIX: the old code sliced [:cutoff] directly, so the documented
        # default of -1 silently dropped the last layer instead of saving all.
        if cutoff == -1:
            cutoff = None
        with open(path, 'wb') as fp:
            self.header_info[3] = self.seen  # keep the header, refreshing the image count
            self.header_info.tofile(fp)  # write the numpy header as raw binary

            # Iterate through layers, mirroring the load_weights ordering.
            for module_def, module in zip(self.module_defs[:cutoff], self.module_list[:cutoff]):
                if module_def['type'] == 'convolutional':
                    conv_layer = module[0]
                    # If batch norm, write bn tensors first (int() coercion as in load_weights).
                    if int(module_def['batch_normalize']):
                        bn_layer = module[1]
                        bn_layer.bias.data.cpu().numpy().tofile(fp)  # .cpu() moves data off the GPU before serialising
                        bn_layer.weight.data.cpu().numpy().tofile(fp)
                        bn_layer.running_mean.data.cpu().numpy().tofile(fp)
                        bn_layer.running_var.data.cpu().numpy().tofile(fp)
                    else:
                        # No bn: the conv bias is stored instead.
                        conv_layer.bias.data.cpu().numpy().tofile(fp)
                    # Conv kernel weights always follow.
                    conv_layer.weight.data.cpu().numpy().tofile(fp)
